Clearing Global Environment

rm(list = ls())

Installing Packages

options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("tidyr")
## 
## The downloaded binary packages are in
##  /var/folders/y4/1qsgv9rs22gf6l_qrwz2tbf40000gn/T//RtmphdQhrw/downloaded_packages
install.packages("plotly")
## 
## The downloaded binary packages are in
##  /var/folders/y4/1qsgv9rs22gf6l_qrwz2tbf40000gn/T//RtmphdQhrw/downloaded_packages

Libraries

library("readr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(plotly)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Importing Data

calendar = read_csv('../data/calendar.csv')
## Rows: 1969 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): weekday, event_name_1, event_type_1, event_name_2, event_type_2
## dbl  (7): wm_yr_wk, wday, month, year, snap_CA, snap_TX, snap_WI
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(calendar)
## # A tibble: 6 × 13
##   date       wm_yr_wk weekday    wday month  year event_name_1 event_type_1
##   <date>        <dbl> <chr>     <dbl> <dbl> <dbl> <chr>        <chr>       
## 1 2011-01-29    11101 Saturday      1     1  2011 <NA>         <NA>        
## 2 2011-01-30    11101 Sunday        2     1  2011 <NA>         <NA>        
## 3 2011-01-31    11101 Monday        3     1  2011 <NA>         <NA>        
## 4 2011-02-01    11101 Tuesday       4     2  2011 <NA>         <NA>        
## 5 2011-02-02    11101 Wednesday     5     2  2011 <NA>         <NA>        
## 6 2011-02-03    11101 Thursday      6     2  2011 <NA>         <NA>        
## # ℹ 5 more variables: event_name_2 <chr>, event_type_2 <chr>, snap_CA <dbl>,
## #   snap_TX <dbl>, snap_WI <dbl>
sales_train_validation = read_csv('../data/sales_train_validation.csv')
## Rows: 30490 Columns: 1918
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (5): item_id, dept_id, cat_id, store_id, state_id
## dbl (1913): d_1, d_2, d_3, d_4, d_5, d_6, d_7, d_8, d_9, d_10, d_11, d_12, d...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sales_train_validation)
## # A tibble: 6 × 1,918
##   item_id   dept_id cat_id store_id state_id   d_1   d_2   d_3   d_4   d_5   d_6
##   <chr>     <chr>   <chr>  <chr>    <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 HOBBIES_… HOBBIE… HOBBI… CA_1     CA           0     0     0     0     0     0
## 2 HOBBIES_… HOBBIE… HOBBI… CA_1     CA           0     0     0     0     0     0
## 3 HOBBIES_… HOBBIE… HOBBI… CA_1     CA           0     0     0     0     0     0
## 4 HOBBIES_… HOBBIE… HOBBI… CA_1     CA           0     0     0     0     0     0
## 5 HOBBIES_… HOBBIE… HOBBI… CA_1     CA           0     0     0     0     0     0
## 6 HOBBIES_… HOBBIE… HOBBI… CA_1     CA           0     0     0     0     0     0
## # ℹ 1,907 more variables: d_7 <dbl>, d_8 <dbl>, d_9 <dbl>, d_10 <dbl>,
## #   d_11 <dbl>, d_12 <dbl>, d_13 <dbl>, d_14 <dbl>, d_15 <dbl>, d_16 <dbl>,
## #   d_17 <dbl>, d_18 <dbl>, d_19 <dbl>, d_20 <dbl>, d_21 <dbl>, d_22 <dbl>,
## #   d_23 <dbl>, d_24 <dbl>, d_25 <dbl>, d_26 <dbl>, d_27 <dbl>, d_28 <dbl>,
## #   d_29 <dbl>, d_30 <dbl>, d_31 <dbl>, d_32 <dbl>, d_33 <dbl>, d_34 <dbl>,
## #   d_35 <dbl>, d_36 <dbl>, d_37 <dbl>, d_38 <dbl>, d_39 <dbl>, d_40 <dbl>,
## #   d_41 <dbl>, d_42 <dbl>, d_43 <dbl>, d_44 <dbl>, d_45 <dbl>, d_46 <dbl>, …
sales_test_validation = read_csv('../data/sales_test_validation.csv')
## Rows: 30490 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): item_id, dept_id, cat_id, store_id, state_id
## dbl (28): d_1914, d_1915, d_1916, d_1917, d_1918, d_1919, d_1920, d_1921, d_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sales_test_validation)
## # A tibble: 6 × 33
##   item_id    dept_id cat_id store_id state_id d_1914 d_1915 d_1916 d_1917 d_1918
##   <chr>      <chr>   <chr>  <chr>    <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 HOBBIES_1… HOBBIE… HOBBI… CA_1     CA            0      0      0      2      0
## 2 HOBBIES_1… HOBBIE… HOBBI… CA_1     CA            0      1      0      0      0
## 3 HOBBIES_1… HOBBIE… HOBBI… CA_1     CA            0      0      1      1      0
## 4 HOBBIES_1… HOBBIE… HOBBI… CA_1     CA            0      0      1      2      4
## 5 HOBBIES_1… HOBBIE… HOBBI… CA_1     CA            1      0      2      3      1
## 6 HOBBIES_1… HOBBIE… HOBBI… CA_1     CA            0      0      1      0      0
## # ℹ 23 more variables: d_1919 <dbl>, d_1920 <dbl>, d_1921 <dbl>, d_1922 <dbl>,
## #   d_1923 <dbl>, d_1924 <dbl>, d_1925 <dbl>, d_1926 <dbl>, d_1927 <dbl>,
## #   d_1928 <dbl>, d_1929 <dbl>, d_1930 <dbl>, d_1931 <dbl>, d_1932 <dbl>,
## #   d_1933 <dbl>, d_1934 <dbl>, d_1935 <dbl>, d_1936 <dbl>, d_1937 <dbl>,
## #   d_1938 <dbl>, d_1939 <dbl>, d_1940 <dbl>, d_1941 <dbl>
sell_prices = read_csv('../data/sell_prices.csv')
## Rows: 6841121 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): store_id, item_id
## dbl (2): wm_yr_wk, sell_price
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sell_prices)
## # A tibble: 6 × 4
##   store_id item_id       wm_yr_wk sell_price
##   <chr>    <chr>            <dbl>      <dbl>
## 1 CA_1     HOBBIES_1_001    11325       9.58
## 2 CA_1     HOBBIES_1_001    11326       9.58
## 3 CA_1     HOBBIES_1_001    11327       8.26
## 4 CA_1     HOBBIES_1_001    11328       8.26
## 5 CA_1     HOBBIES_1_001    11329       8.26
## 6 CA_1     HOBBIES_1_001    11330       8.26
weights_validation = read_csv('../data/weights_validation.csv')
## Rows: 42840 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Level_id, Agg_Level_1, Agg_Level_2
## dbl (2): Dollar_Sales, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(weights_validation)
## # A tibble: 6 × 5
##   Level_id Agg_Level_1   Agg_Level_2 Dollar_Sales     weight
##   <chr>    <chr>         <chr>              <dbl>      <dbl>
## 1 Level12  HOBBIES_1_001 CA_1              225.   0.00006   
## 2 Level12  HOBBIES_1_002 CA_1                7.94 0.00000212
## 3 Level12  HOBBIES_1_003 CA_1               47.5  0.0000127 
## 4 Level12  HOBBIES_1_004 CA_1              237.   0.0000631 
## 5 Level12  HOBBIES_1_005 CA_1              109.   0.0000292 
## 6 Level12  HOBBIES_1_006 CA_1               16.3  0.00000435

Data Aggregation

# Joined data into a single data frame
# Pivot single data frame so dates become rows
# Aggregate to category/day
# Make date format the same as calendar

pivoted_sales_train_validation <- sales_train_validation %>%
  pivot_longer(
    cols = starts_with("d_"),  # Select columns starting with 'd_'
    names_to = "day",          # New column name for the pivoted columns
    values_to = "value"        # New column name for the values in the selected columns
  ) %>%
mutate(day = as.numeric(gsub("d_", "", day)))
grouped_data <- pivoted_sales_train_validation %>%
  group_by(cat_id, day) %>%
  summarize(total = sum(value))
## `summarise()` has grouped output by 'cat_id'. You can override using the
## `.groups` argument.
# Add day column to calendar data frame
calendar$day <- 1:nrow(calendar)
# left join data 
grouped_data <- left_join(grouped_data, calendar, by = "day")
# Holiday list variables
influential_holidays = list(
      #Thanksgiving 2011-2015
      list(
        type = "line",
        x0 = as.Date("2011-11-24"), x1 = as.Date("2011-11-24"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "brown", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2012-11-22"), x1 = as.Date("2012-11-22"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "brown", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2013-11-28"), x1 = as.Date("2013-11-28"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "brown", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2014-11-27"), x1 = as.Date("2014-11-27"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "brown", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2015-11-26"), x1 = as.Date("2015-11-26"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "brown", dash = "dash")               # Set line color and style
      ),
      #Christmas 2011-2015
      list(
        type = "line",
        x0 = as.Date("2011-12-25"), x1 = as.Date("2011-12-25"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "red", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2012-12-25"), x1 = as.Date("2012-12-25"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "red", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2013-12-25"), x1 = as.Date("2013-12-25"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "red", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2014-12-25"), x1 = as.Date("2014-12-25"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "red", dash = "dash")               # Set line color and style
      ),
      list(
        type = "line",
        x0 = as.Date("2015-12-25"), x1 = as.Date("2015-12-25"),  # x-coordinate for the vertical line
        y0 = 0, y1 = 40000 * 1.1,                               # Extend from y = 0 to y above the max value
        line = list(color = "red", dash = "dash")               # Set line color and style
      )
    )
# plot data
quantity_plot <- plot_ly(grouped_data, x = ~as.Date(date), y = ~total, color = ~cat_id, type='scatter', mode = 'lines') %>%
  layout(title = "Walmart Product Data",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Quantity"),
         shapes = influential_holidays)
quantity_plot

```{r}

overlayed_plot <- quantity_plot %>%

add_lines(x = rep(as.Date(“2015-03-01”), 2), # Repeated x value for vertical line

y = c(0, 40000 * 1.1), # From 0 to y above max value

line = list(color = ‘red’, dash = ‘dash’), # Set color and dash style

name = “Event”) # Add name to the legend

# Display the plot

overlayed_plot

```